Linear Regression fits a linear model to the data by adjusting a set of coefficients $w$ to minimize the residual sum of squares between the observed responses and the responses predicted by the linear approximation.
Linear model: $y = X\beta + \epsilon$
Objective function: $\min_w ||Xw - y||_2^2$
Predictive model: $\hat{y}(w,x) = w_0 + w_1x_1 + \ldots + w_px_p$
Notation: $X$ is the matrix of input features, $y$ is the vector of observed responses, $w$ (the $\beta$ of the statistical formulation) is the vector of coefficients, and $\epsilon$ is the error term.
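As a minimal sketch of this objective (on a tiny synthetic data set made up for illustration, not the fixtures loaded below), the coefficients that minimize the residual sum of squares can be recovered directly with NumPy's least-squares solver:
In [ ]:
# Minimal sketch: minimize ||Xw - y||^2 on synthetic data (illustration only)
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.rand(20, 3)                       # 20 samples, 3 features
w_true = np.array([1.5, -2.0, 3.0])            # "true" coefficients for the sketch
y_demo = X_demo.dot(w_true) + 0.01 * rng.randn(20)

w_hat, rss, rank, sv = np.linalg.lstsq(X_demo, y_demo, rcond=None)
print(w_hat)                                   # should be close to w_true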
In [ ]:
%matplotlib notebook
In [ ]:
import os
import sklearn
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [ ]:
# Fixtures
GENDATA = os.path.join("data", "generated")
DATASET = "dataset{}.txt"
TARGET = "target{}.txt"
COEFS = "coefs{}.txt"
def load_gendata(suffix=""):
    X = np.loadtxt(os.path.join(GENDATA, DATASET.format(suffix)))
    y = np.loadtxt(os.path.join(GENDATA, TARGET.format(suffix)))
    w = np.loadtxt(os.path.join(GENDATA, COEFS.format(suffix)))
    return X, y, w
In [ ]:
X,y,w = load_gendata() # Sample data set
Xc,yc,wc = load_gendata("-collin") # Collinear data set
Xd,yd,wd = load_gendata("-demo") # Demo data set
In [ ]:
# Fix for 1D demo (for viz)
Xd = Xd.reshape(Xd.shape[0], 1)
In [ ]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(Xd, yd)
print(model)
print(model.coef_)
print(wd)
print(model.intercept_)
In [ ]:
def draw_model(X, y, model, w):
    k = X.shape[1]
    if k > 2 or k < 1:
        raise ValueError("Cannot plot in more than 3D!")

    # Determine if 2D or 3D
    fig = plt.figure()
    if k == 2:
        from mpl_toolkits.mplot3d import Axes3D  # noqa: registers the '3d' projection
        ax = fig.add_subplot(111, projection='3d')
        # Scatter plot of points
        ax.scatter(X[:, 0], X[:, 1], y)
        # Wireframe of the original model; the fixtures store only the
        # coefficients, so the true surface is drawn without an intercept
        xm, ym = np.meshgrid(np.linspace(X[:, 0].min(), X[:, 0].max()),
                             np.linspace(X[:, 1].min(), X[:, 1].max()))
        zm = w[0] * xm + w[1] * ym
        ax.plot_wireframe(xm, ym, zm, alpha=0.5, color='b')
        # Wireframe of the fitted model: stack the grid into (n_samples, 2)
        # before predicting, then reshape back to the grid
        zp = model.predict(np.c_[xm.ravel(), ym.ravel()]).reshape(xm.shape)
        ax.plot_wireframe(xm, ym, zp, alpha=0.5, color='g')
    else:
        ax = fig.add_subplot(111)
        # Scatter plot of points
        ax.scatter(X, y)
        # Line plot of the original model
        Xm = np.linspace(X.min(), X.max())
        Xm = Xm.reshape(Xm.shape[0], 1)
        ym = np.dot(Xm, w)
        ax.plot(Xm, ym, c='b')
        # Line plot of the fitted model
        yp = model.predict(Xm)
        ax.plot(Xm, yp, c='g')
    return ax
draw_model(Xd, yd, model, wd)
In [ ]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y)
model = LinearRegression()
model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = model.score(X_test, y_test)
print("MSE: {:0.3f} | R2: {:0.3f}".format(mse, r2))
Vector Norm
A norm describes the length (magnitude) of a vector; the regularization penalties below are written in terms of the L1 and L2 norms of the coefficient vector.
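As a quick illustration (not part of the data fixtures used here), the L1 and L2 norms can be computed directly with NumPy:
In [ ]:
# Illustration: L1 and L2 norms of a small vector
import numpy as np

v = np.array([3.0, -4.0])
print(np.linalg.norm(v, ord=2))  # L2 norm: sqrt(3**2 + 4**2) = 5.0
print(np.linalg.norm(v, ord=1))  # L1 norm: |3| + |-4| = 7.0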
In [ ]:
X_train, X_test, y_train, y_test = tts(Xc, yc)
model = LinearRegression()
model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = model.score(X_test, y_test)
print("{}\nMSE: {:0.3f} | R2: {:0.3f}".format(model,mse, r2))
Ridge regression mitigates overfitting and collinearity by penalizing the size of the coefficients: it minimizes a penalized residual sum of squares.
Said another way, it shrinks the coefficients toward zero.
$\min_w ||Xw - y||_2^2 + \alpha ||w||_2^2$
where $\alpha > 0$ is a complexity parameter that controls the amount of shrinkage: the larger $\alpha$, the more robust the model is to collinearity.
Alpha also influences the bias/variance tradeoff: the larger the ridge $\alpha$, the higher the bias and the lower the variance.
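A rough sketch of the shrinkage effect (reusing the collinear training split created above; the alpha values are arbitrary): as $\alpha$ grows, the norm of the fitted coefficient vector shrinks.
In [ ]:
# Sketch: larger alpha => smaller coefficient norm (reuses X_train, y_train from above)
from sklearn.linear_model import Ridge
import numpy as np

for a in (0.01, 1.0, 100.0):
    m = Ridge(alpha=a).fit(X_train, y_train)
    print("alpha={:<6} ||w||_2 = {:0.3f}".format(a, np.linalg.norm(m.coef_)))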
In [ ]:
from sklearn.linear_model import Ridge
model = Ridge(alpha=0.1)
model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = model.score(X_test, y_test)
print("{}\nMSE: {:0.3f} | R2: {:0.3f}".format(model, mse, r2))
Shrinking the coefficients is one thing, but what if the solution itself should be sparse? The more dimensions we add, the more of the model is spent on features that may not matter.
The Lasso prefers solutions with fewer non-zero coefficients, effectively reducing the number of variables the prediction depends on.
$\min_w \frac{1}{2n} ||Xw - y||_2^2 + \alpha ||w||_1$
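A quick sketch of the sparsity this induces (again reusing the collinear training split; the alpha value is arbitrary): with the same alpha, Lasso drives some coefficients exactly to zero while Ridge only shrinks them.
In [ ]:
# Sketch: Lasso zeroes out coefficients entirely, Ridge only shrinks them
from sklearn.linear_model import Lasso, Ridge
import numpy as np

lasso = Lasso(alpha=0.5).fit(X_train, y_train)
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
print("Lasso zero coefficients:", int(np.sum(lasso.coef_ == 0)))
print("Ridge zero coefficients:", int(np.sum(ridge.coef_ == 0)))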
In [ ]:
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.5)
model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = model.score(X_test, y_test)
print("{}\nMSE: {:0.3f} | R2: {:0.3f}".format(model, mse, r2))
ElasticNet is a linear model trained with both L1 and L2 priors as its regularizer.
This combination allows learning a sparse model in which few of the weights are non-zero, like Lasso, while still maintaining the regularization properties of Ridge. The convex combination of L1 and L2 is controlled with a ratio parameter (see the sketch after the formula below).
Elastic-Net is useful when there are multiple features that are correlated with one another: Lasso is likely to pick one of them at random, while Elastic-Net is likely to pick both.
A practical advantage of trading off between Lasso and Ridge is that it allows Elastic-Net to inherit some of Ridge's stability under rotation.
$\min_w \frac{1}{2n} ||Xw - y||_2^2 + \alpha\rho ||w||_1 + \frac{\alpha(1-\rho)}{2} ||w||_2^2$
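In scikit-learn the $\rho$ above is exposed as the l1_ratio parameter (it defaults to 0.5 in the cell below). A small sketch of its effect, reusing the collinear split from above:
In [ ]:
# Sketch: l1_ratio controls the L1/L2 mix; higher values behave more like Lasso
from sklearn.linear_model import ElasticNet

for ratio in (0.1, 0.5, 0.9):
    m = ElasticNet(alpha=0.5, l1_ratio=ratio).fit(X_train, y_train)
    print("l1_ratio={:0.1f} | non-zero coefs: {}".format(ratio, int((m.coef_ != 0).sum())))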
In [ ]:
from sklearn.linear_model import ElasticNet
model = ElasticNet(alpha=0.5)
model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = model.score(X_test, y_test)
print("{}\nMSE: {:0.3f} | R2: {:0.3f}".format(model, mse, r2))
In [ ]:
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
alphas = np.logspace(-10, -2, 200)
ridge = RidgeCV(alphas=alphas)
lasso = LassoCV(alphas=alphas)
elnet = ElasticNetCV(alphas=alphas)
for model in (ridge, lasso, elnet):
    model.fit(X_train, y_train)
    mse = mean_squared_error(y_test, model.predict(X_test))
    r2 = model.score(X_test, y_test)
    # Use scientific notation for alpha since the grid spans 1e-10 to 1e-2
    print("{}\nAlpha: {:0.2e} | MSE: {:0.3f} | R2: {:0.3f}".format(model, model.alpha_, mse, r2))
In [ ]:
clf = Ridge(fit_intercept=False)
errors = []
for alpha in alphas:
    splits = tts(X, y, test_size=0.2)
    X_train, X_test, y_train, y_test = splits
    clf.set_params(alpha=alpha)
    clf.fit(X_train, y_train)
    error = mean_squared_error(y_test, clf.predict(X_test))
    errors.append(error)

axe = plt.gca()
axe.set_xscale('log')  # alphas span eight orders of magnitude
axe.plot(alphas, errors)
To do higher-order polynomial regression, we can use linear models trained on nonlinear functions of the data!
In scikit-learn this is wired up with a Pipeline that chains a polynomial feature transformer with a linear model.
Consider the standard linear regression case:
$\hat{y}(w,x) = w_0 + \sum_i^n{w_ix_i}$
The quadratic case (polynomial degree = 2) is:
$\hat{y}(w,v,x) = w_0 + \sum_i^n{w_ix_i} + \sum_i^n{v_ix_i^2}$
But this can just be seen as a new feature space:
$z = [x_1,...,x_n,x_1^2,...,x_n^2]$
And this feature space can be fit with an ordinary linear model, since the model is still linear in the coefficients. We just need a transformer that adds the second-degree dimensions, as in the sketch below.
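A tiny illustration of this expanded feature space: note that scikit-learn's PolynomialFeatures also adds a bias column and the interaction terms (e.g. $x_1x_2$) in addition to the squares listed above.
In [ ]:
# Illustration: degree-2 expansion of a single sample [2, 3]
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

x = np.array([[2.0, 3.0]])
print(PolynomialFeatures(degree=2).fit_transform(x))
# columns: 1, x1, x2, x1^2, x1*x2, x2^2  ->  [[1. 2. 3. 4. 6. 9.]]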
In [ ]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model = Pipeline([
('poly', PolynomialFeatures(2)),
('ridge', RidgeCV(alphas=alphas)),
])
model.fit(X_train, y_train)
mse = mean_squared_error(y_test, model.predict(X_test))
r2 = model.score(X_test, y_test)
print("{}\nMSE: {:0.3f} | R2: {:0.3f}".format(model, mse, r2))
In [ ]:
ENERGY = "http://archive.ics.uci.edu/ml/machine-learning-databases/00242/ENB2012_data.xlsx"
In [ ]:
def download_data(url, path='data'):
    if not os.path.exists(path):
        os.mkdir(path)
    response = requests.get(url)
    name = os.path.basename(url)
    with open(os.path.join(path, name), 'wb') as f:
        f.write(response.content)
In [ ]:
download_data(ENERGY)
In [ ]:
energy = pd.read_excel('data/ENB2012_data.xlsx')
energy.columns = ['compactness','surface_area','wall_area','roof_area','height',\
'orientation','glazing_area','distribution','heating_load','cooling_load']
In [ ]:
energy.head()
In [ ]:
energy.describe()
In [ ]:
from pandas.plotting import scatter_matrix
ax = scatter_matrix(energy, alpha=0.2, figsize=(9,9), diagonal='kde')
In [ ]:
energy_features = energy.iloc[:, 0:8]
energy_labels = energy.iloc[:, 8:]
In [ ]:
from sklearn.linear_model import RandomizedLasso
model = RandomizedLasso(alpha=0.1)
model.fit(energy_features, energy_labels["heating_load"])
names = list(energy_features)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), model.scores_),
names), reverse=True))
In [ ]:
model = RandomizedLasso(alpha=0.1)
model.fit(energy_features, energy_labels["cooling_load"])
names = list(energy_features)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), model.scores_),
names), reverse=True))
In [ ]:
heat_labels = energy.iloc[:, 8]
In [ ]:
def fit_and_evaluate(model, X, y):
    X_train, X_test, y_train, y_test = tts(X, y)
    model.fit(X_train, y_train)
    mse = mean_squared_error(y_test, model.predict(X_test))
    r2 = model.score(X_test, y_test)
    print("{}\nMSE: {:0.3f} | R2: {:0.3f}".format(model, mse, r2))
In [ ]:
from sklearn.ensemble import RandomForestRegressor
models = [
LinearRegression(),
RidgeCV(alphas=alphas),
LassoCV(alphas=alphas),
ElasticNetCV(alphas=alphas),
RandomForestRegressor(),
]
for model in models:
    fit_and_evaluate(model, energy_features, heat_labels)